// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qs8_gemm_params params)  [sp + 8] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3  v0  v4
# A1 x15  v1  v5
# A2 x13  v2  v6
# A3  x4  v3  v7
# B   x5  v8  v9 v10 v11
# C0  x6 v16 v20 v24 v28
# C1  x8 v17 v21 v25 v29
# C2  x9 v18 v22 v26 v30
# C3  x7 v19 v23 v27 v31
# x14 temp holding the upper 64 bits of a B vector until INS merges it
# unused v12 v13 v14 v15

BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55

        // Overview
        //   Signed 8-bit GEMM micro-kernel that produces a tile of up to 4 rows
        //   by 16 columns of 8-bit outputs per pass over K.  Every SDOT consumes
        //   4 bytes of A from one row against 16 bytes of B, which is why kc is
        //   rounded up to a multiple of 4 below.
        //   Label map: 0 = per-tile setup, 1 = main loop (16 bytes of K per
        //   row per iteration), 2 = epilogue, 4/5 = K remainder,
        //   3 = requantize and store, 6-10 = partial-width store.

        # Clamp A and C pointers
        // The flags from the CMP against 2 feed both the LO (mr < 2) and the
        // LS (mr <= 2) selects; the STP, ADD, CSEL and BIC in between do not
        // modify the flags.
        // The 32-byte frame holds d8-d11, so after the first STP the stack
        // arguments are read at [sp, 32] (cn_stride) and [sp, 40] (params).
        CMP      x0, 2             // if mr < 2
        STP      d8,  d9, [sp, -32]!
        ADD     x15, x3, x4        // a1 = a0 + a_stride
        ADD      x8, x6, x7        // c1 = c0 + cm_stride
        STP     d10, d11, [sp, 16]
        CSEL    x15, x3, x15, LO   //   a1 = a0
        CSEL     x8, x6,  x8, LO   //   c1 = c0
        ADD      x2, x2, 3         // kc = (kc + 3) & ~3

        ADD     x13, x15, x4       // a2 = a1 + a_stride
        ADD      x9,  x8, x7       // c2 = c1 + cm_stride
                                   // if mr <= 2
        CSEL    x13, x15, x13, LS  //   a2 = a1
        CSEL     x9,  x8,  x9, LS  //   c2 = c1
        BIC      x2, x2, 3

        // From here x4 and x7 stop being strides and become the row 3 pointers.
        CMP      x0, 4             // if mr < 4
        ADD      x4, x13, x4       // a3 = a2 + a_stride
        ADD      x7,  x9, x7       // c3 = c2 + cm_stride
        CSEL     x4, x13, x4, LO   //   a3 = a2
        CSEL     x7,  x9, x7, LO   //   c3 = c2

        .p2align 3
0:
        // Start of one 16-column tile.  The 64 bytes at w seed the row 0
        // accumulators (v16/v20/v24/v28) and are copied to rows 1-3.
        // x11 is reloaded on every tile because the requantization code
        // advances it while reading the parameters.
        // x0 is recycled as the K byte counter; mr is not read again.
        # Load initial bias from w into accumulators
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        LDR     x11, [sp, 40]      // params
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        SUBS    x0, x2, 16         // k = kc - 16
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        # Is there at least 16 bytes for prologue/epilogue?
        B.LO    4f

        // v8 is loaded whole.  v9 arrives as two 64-bit halves: the low half
        // in d9 and the high half in x14, merged by the INS in the next BLOCK 0.
        # prologue - read A and B values for block 0 and 1
        LDR      d0,  [x3], 8
        LDR      q8,  [x5], 16
        LDR      d1, [x15], 8
        LDR      d2, [x13], 8
        LDR      d3,  [x4], 8
        SUBS    x0, x0, 16         // is there 16 for main loop?
        LDR      d9,  [x5], 8
        LDR     x14,  [x5], 8
        # Is there at least 16 bytes for main loop?
        B.LO    2f

        # Main loop - 16 bytes of A in 4 groups.
        # 4 row of 4 vectors wide = 16 sdot instructions for 4 channels
        # 4 LD64 for A
        # 4 LD128 for W. = 2 LD64 + INS.
        # for each 4 sdot, 1 LD64 for A, 2 LD64 for W + INS.

        // Software pipelined: each block multiplies with a B vector completed
        // in the previous block while fetching the B vector two blocks ahead
        // (LDR d = low half, LDR x14 = high half, INS joins them).
        // A is double buffered: v0-v3 are consumed while v4-v7 load, then
        // v4-v7 are consumed while v0-v3 load for the next iteration.
        .p2align 3
1:
        // BLOCK 0
        SDOT    v16.4s,  v8.16b, v0.4b[0]
        LDR     d10,  [x5], 8
        SDOT    v17.4s,  v8.16b, v1.4b[0]
        INS      v9.d[1], x14
        SDOT    v18.4s,  v8.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v19.4s,  v8.16b, v3.4b[0]
        LDR      d4,  [x3], 8

        // BLOCK 1
        SDOT    v20.4s,  v9.16b, v0.4b[0]
        LDR     d11,  [x5], 8
        SDOT    v21.4s,  v9.16b, v1.4b[0]
        INS     v10.d[1], x14
        SDOT    v22.4s,  v9.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v23.4s,  v9.16b, v3.4b[0]
        LDR      d5, [x15], 8

        // BLOCK 2
        SDOT    v24.4s, v10.16b, v0.4b[0]
        LDR      d8,  [x5], 8
        SDOT    v25.4s, v10.16b, v1.4b[0]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v27.4s, v10.16b, v3.4b[0]
        LDR      d6, [x13], 8

        // BLOCK 3
        SDOT    v28.4s, v11.16b, v0.4b[0]
        LDR      d9,  [x5], 8
        SDOT    v29.4s, v11.16b, v1.4b[0]
        INS      v8.d[1], x14
        SDOT    v30.4s, v11.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v31.4s, v11.16b, v3.4b[0]
        LDR      d7,  [x4], 8

        // BLOCK 0
        SDOT    v16.4s,  v8.16b, v0.4b[1]
        LDR     d10,  [x5], 8
        SDOT    v17.4s,  v8.16b, v1.4b[1]
        INS      v9.d[1], x14
        SDOT    v18.4s,  v8.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v19.4s,  v8.16b, v3.4b[1]

        // BLOCK 1
        SDOT    v20.4s,  v9.16b, v0.4b[1]
        LDR     d11,  [x5], 8
        SDOT    v21.4s,  v9.16b, v1.4b[1]
        INS     v10.d[1], x14
        SDOT    v22.4s,  v9.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v23.4s,  v9.16b, v3.4b[1]

        // BLOCK 2
        SDOT    v24.4s, v10.16b, v0.4b[1]
        LDR      d8,  [x5], 8
        SDOT    v25.4s, v10.16b, v1.4b[1]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v27.4s, v10.16b, v3.4b[1]

        // BLOCK 3
        SDOT    v28.4s, v11.16b, v0.4b[1]
        LDR      d9,  [x5], 8
        SDOT    v29.4s, v11.16b, v1.4b[1]
        INS      v8.d[1], x14
        SDOT    v30.4s, v11.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v31.4s, v11.16b, v3.4b[1]

        // BLOCK 0
        SDOT    v16.4s,  v8.16b, v4.4b[0]
        LDR     d10,  [x5], 8
        SDOT    v17.4s,  v8.16b, v5.4b[0]
        INS      v9.d[1], x14
        SDOT    v18.4s,  v8.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v19.4s,  v8.16b, v7.4b[0]
        LDR      d0,  [x3], 8

        // BLOCK 1
        SDOT    v20.4s,  v9.16b, v4.4b[0]
        LDR     d11,  [x5], 8
        SDOT    v21.4s,  v9.16b, v5.4b[0]
        INS     v10.d[1], x14
        SDOT    v22.4s,  v9.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v23.4s,  v9.16b, v7.4b[0]
        LDR      d1, [x15], 8

        // BLOCK 2
        SDOT    v24.4s, v10.16b, v4.4b[0]
        LDR      d8,  [x5], 8
        SDOT    v25.4s, v10.16b, v5.4b[0]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v27.4s, v10.16b, v7.4b[0]
        LDR      d2, [x13], 8

        // BLOCK 3
        SDOT    v28.4s, v11.16b, v4.4b[0]
        LDR      d9,  [x5], 8
        SDOT    v29.4s, v11.16b, v5.4b[0]
        INS      v8.d[1], x14
        SDOT    v30.4s, v11.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v31.4s, v11.16b, v7.4b[0]
        LDR      d3,  [x4], 8

        // BLOCK 0
        SDOT    v16.4s,  v8.16b, v4.4b[1]
        LDR     d10,  [x5], 8
        SDOT    v17.4s,  v8.16b, v5.4b[1]
        INS      v9.d[1], x14
        SDOT    v18.4s,  v8.16b, v6.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v19.4s,  v8.16b, v7.4b[1]

        // BLOCK 1
        SDOT    v20.4s,  v9.16b, v4.4b[1]
        LDR     d11,  [x5], 8
        SDOT    v21.4s,  v9.16b, v5.4b[1]
        INS     v10.d[1], x14
        SDOT    v22.4s,  v9.16b, v6.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v23.4s,  v9.16b, v7.4b[1]

        // BLOCK 2
        SDOT    v24.4s, v10.16b, v4.4b[1]
        LDR      d8,  [x5], 8   // First B values for block 0 and 1
        SDOT    v25.4s, v10.16b, v5.4b[1]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v6.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v27.4s, v10.16b, v7.4b[1]
        // k -= 16.  The flags set here are consumed by the B.HS that closes
        // the loop; the SDOT, LDR and INS in between leave them untouched.
        SUBS    x0, x0, 16

        // BLOCK 3
        SDOT    v28.4s, v11.16b, v4.4b[1]
        LDR      d9,  [x5], 8
        SDOT    v29.4s, v11.16b, v5.4b[1]
        INS      v8.d[1], x14
        SDOT    v30.4s, v11.16b, v6.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v31.4s, v11.16b, v7.4b[1]
        B.HS    1b

        // Consumes the A and B values preloaded by the prologue or by the last
        // main loop iteration.  It still loads the second 8 bytes of A per
        // row (v4-v7) but loads no A for a following iteration, and the B
        // loads stop once v11 for the last group has been fetched.
        # Epilogue.  Same as main loop but no preloads in final group
2:
        // BLOCK 0
        SDOT    v16.4s,  v8.16b, v0.4b[0]
        LDR     d10,  [x5], 8
        SDOT    v17.4s,  v8.16b, v1.4b[0]
        INS      v9.d[1], x14
        SDOT    v18.4s,  v8.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v19.4s,  v8.16b, v3.4b[0]
        LDR      d4,  [x3], 8

        // BLOCK 1
        SDOT    v20.4s,  v9.16b, v0.4b[0]
        LDR     d11,  [x5], 8
        SDOT    v21.4s,  v9.16b, v1.4b[0]
        INS     v10.d[1], x14
        SDOT    v22.4s,  v9.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v23.4s,  v9.16b, v3.4b[0]
        LDR      d5, [x15], 8

        // BLOCK 2
        SDOT    v24.4s, v10.16b, v0.4b[0]
        LDR      d8,  [x5], 8
        SDOT    v25.4s, v10.16b, v1.4b[0]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v27.4s, v10.16b, v3.4b[0]
        LDR      d6, [x13], 8

        // BLOCK 3
        SDOT    v28.4s, v11.16b, v0.4b[0]
        LDR      d9,  [x5], 8
        SDOT    v29.4s, v11.16b, v1.4b[0]
        INS      v8.d[1], x14
        SDOT    v30.4s, v11.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v31.4s, v11.16b, v3.4b[0]
        LDR      d7,  [x4], 8

        // BLOCK 0
        SDOT    v16.4s,  v8.16b, v0.4b[1]
        LDR     d10,  [x5], 8
        SDOT    v17.4s,  v8.16b, v1.4b[1]
        INS      v9.d[1], x14
        SDOT    v18.4s,  v8.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v19.4s,  v8.16b, v3.4b[1]

        // BLOCK 1
        SDOT    v20.4s,  v9.16b, v0.4b[1]
        LDR     d11,  [x5], 8
        SDOT    v21.4s,  v9.16b, v1.4b[1]
        INS     v10.d[1], x14
        SDOT    v22.4s,  v9.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v23.4s,  v9.16b, v3.4b[1]

        // BLOCK 2
        SDOT    v24.4s, v10.16b, v0.4b[1]
        LDR      d8,  [x5], 8
        SDOT    v25.4s, v10.16b, v1.4b[1]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v27.4s, v10.16b, v3.4b[1]

        // BLOCK 3
        SDOT    v28.4s, v11.16b, v0.4b[1]
        LDR      d9,  [x5], 8
        SDOT    v29.4s, v11.16b, v1.4b[1]
        INS      v8.d[1], x14
        SDOT    v30.4s, v11.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v31.4s, v11.16b, v3.4b[1]

        // BLOCK 0
        SDOT    v16.4s,  v8.16b, v4.4b[0]
        LDR     d10,  [x5], 8
        SDOT    v17.4s,  v8.16b, v5.4b[0]
        INS      v9.d[1], x14
        SDOT    v18.4s,  v8.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v19.4s,  v8.16b, v7.4b[0]

        // BLOCK 1
        SDOT    v20.4s,  v9.16b, v4.4b[0]
        LDR     d11,  [x5], 8
        SDOT    v21.4s,  v9.16b, v5.4b[0]
        INS     v10.d[1], x14
        SDOT    v22.4s,  v9.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v23.4s,  v9.16b, v7.4b[0]

        // BLOCK 2
        SDOT    v24.4s, v10.16b, v4.4b[0]
        LDR      d8,  [x5], 8
        SDOT    v25.4s, v10.16b, v5.4b[0]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v27.4s, v10.16b, v7.4b[0]

        // BLOCK 3
        SDOT    v28.4s, v11.16b, v4.4b[0]
        LDR      d9,  [x5], 8
        SDOT    v29.4s, v11.16b, v5.4b[0]
        INS      v8.d[1], x14
        SDOT    v30.4s, v11.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v31.4s, v11.16b, v7.4b[0]

        // BLOCK 0
        SDOT    v16.4s,  v8.16b, v4.4b[1]
        LDR     d10,  [x5], 8
        SDOT    v17.4s,  v8.16b, v5.4b[1]
        INS      v9.d[1], x14
        SDOT    v18.4s,  v8.16b, v6.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v19.4s,  v8.16b, v7.4b[1]

        // BLOCK 1
        SDOT    v20.4s,  v9.16b, v4.4b[1]
        LDR     d11,  [x5], 8
        SDOT    v21.4s,  v9.16b, v5.4b[1]
        INS     v10.d[1], x14
        SDOT    v22.4s,  v9.16b, v6.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v23.4s,  v9.16b, v7.4b[1]

        // BLOCK 2
        SDOT    v24.4s, v10.16b, v4.4b[1]
        SDOT    v25.4s, v10.16b, v5.4b[1]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v6.4b[1]
        SDOT    v27.4s, v10.16b, v7.4b[1]
        AND     x0, x2, 15        // kc remainder 0 to 12

        // BLOCK 3
        SDOT    v28.4s, v11.16b, v4.4b[1]
        SDOT    v29.4s, v11.16b, v5.4b[1]
        SDOT    v30.4s, v11.16b, v6.4b[1]
        SDOT    v31.4s, v11.16b, v7.4b[1]

        # Is there a remainder?- 4 to 12 bytes of A
        CBNZ    x0, 5f

        // Requantization of the 16 int32 accumulators per row down to int8.
        // The parameters are read sequentially through x11:
        //   32-bit multiplier -> v0, 32-bit shift -> v1,
        //   16-bit value added after narrowing -> v2 (labelled bias below;
        //   presumably the output zero point - confirm against the params
        //   initialization), 8-bit min -> v0, 8-bit max -> v1.
        // Steps per accumulator:
        //   SQRDMULH  product = saturating rounding doubling high multiply
        //   CMEQ/BIC  clear the accumulator in lanes whose shift is zero
        //   SSRA 31   add the sign of the (masked) accumulator to the product,
        //             i.e. -1 for negative inputs; presumably so that the
        //             rounding shift below rounds ties away from zero
        //   SRSHL     rounding shift by the per-lane signed amount in v1;
        //             NOTE(review): a right shift requires v1 to be negative,
        //             confirm how the shift parameter is stored
        // v4-v11 and v16-v23 are reused as scratch for the scaled values.
        .p2align 3
3:
        # Apply params - scale, shift, bias and clamp
        LD1R    {v0.4s}, [x11], 4
        SQRDMULH     v4.4s, v16.4s, v0.4s
        SQRDMULH     v5.4s, v17.4s, v0.4s
        LD1R    {v1.4s}, [x11], 4
        SQRDMULH     v6.4s, v18.4s, v0.4s
        SQRDMULH     v7.4s, v19.4s, v0.4s
        SQRDMULH     v8.4s, v20.4s, v0.4s
        SQRDMULH     v9.4s, v21.4s, v0.4s
        CMEQ    v2.4s, v1.4s, 0
        SQRDMULH    v10.4s, v22.4s, v0.4s
        SQRDMULH    v11.4s, v23.4s, v0.4s

        BIC     v16.16b, v16.16b, v2.16b
        BIC     v17.16b, v17.16b, v2.16b
        BIC     v18.16b, v18.16b, v2.16b
        BIC     v19.16b, v19.16b, v2.16b
        BIC     v20.16b, v20.16b, v2.16b
        BIC     v21.16b, v21.16b, v2.16b
        BIC     v22.16b, v22.16b, v2.16b
        BIC     v23.16b, v23.16b, v2.16b

        SSRA     v4.4s, v16.4s, 31  // signed shift right accumulate
        SSRA     v5.4s, v17.4s, 31
        SSRA     v6.4s, v18.4s, 31
        SSRA     v7.4s, v19.4s, 31
        SSRA     v8.4s, v20.4s, 31
        SSRA     v9.4s, v21.4s, 31
        SSRA    v10.4s, v22.4s, 31
        SSRA    v11.4s, v23.4s, 31

        // Columns 8-15: the scaled values go into v16-v23, which are free now
        // that columns 0-7 have been moved to v4-v11.
        SQRDMULH  v16.4s, v24.4s, v0.4s
        SQRDMULH  v17.4s, v25.4s, v0.4s
        SQRDMULH  v18.4s, v26.4s, v0.4s
        SQRDMULH  v19.4s, v27.4s, v0.4s
        SQRDMULH  v20.4s, v28.4s, v0.4s
        SQRDMULH  v21.4s, v29.4s, v0.4s
        SQRDMULH  v22.4s, v30.4s, v0.4s
        SQRDMULH  v23.4s, v31.4s, v0.4s

        BIC     v24.16b, v24.16b, v2.16b
        BIC     v25.16b, v25.16b, v2.16b
        BIC     v26.16b, v26.16b, v2.16b
        BIC     v27.16b, v27.16b, v2.16b
        BIC     v28.16b, v28.16b, v2.16b
        BIC     v29.16b, v29.16b, v2.16b
        BIC     v30.16b, v30.16b, v2.16b
        BIC     v31.16b, v31.16b, v2.16b

        SSRA    v16.4s, v24.4s, 31
        SSRA    v17.4s, v25.4s, 31
        SSRA    v18.4s, v26.4s, 31
        SSRA    v19.4s, v27.4s, 31
        SSRA    v20.4s, v28.4s, 31
        SSRA    v21.4s, v29.4s, 31
        SSRA    v22.4s, v30.4s, 31
        SSRA    v23.4s, v31.4s, 31

        SRSHL    v4.4s,  v4.4s, v1.4s  // signed rounding shift left
        SRSHL    v5.4s,  v5.4s, v1.4s
        SRSHL    v6.4s,  v6.4s, v1.4s
        SRSHL    v7.4s,  v7.4s, v1.4s
        SRSHL    v8.4s,  v8.4s, v1.4s
        SRSHL    v9.4s,  v9.4s, v1.4s
        SRSHL   v10.4s, v10.4s, v1.4s
        SRSHL   v11.4s, v11.4s, v1.4s

        SRSHL   v16.4s, v16.4s, v1.4s
        SRSHL   v17.4s, v17.4s, v1.4s
        SRSHL   v18.4s, v18.4s, v1.4s
        SRSHL   v19.4s, v19.4s, v1.4s
        SRSHL   v20.4s, v20.4s, v1.4s
        SRSHL   v21.4s, v21.4s, v1.4s
        SRSHL   v22.4s, v22.4s, v1.4s
        SRSHL   v23.4s, v23.4s, v1.4s

        // Saturating narrow 32 -> 16 bit.  After the SQXTN/SQXTN2 pairs
        // v4-v7 hold columns 0-7 and v16-v19 hold columns 8-15 of rows 0-3.
        SQXTN    v4.4h,  v4.4s
        SQXTN    v5.4h,  v5.4s
        SQXTN    v6.4h,  v6.4s
        SQXTN    v7.4h,  v7.4s
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        LD1R    {v2.8h}, [x11], 2   // add bias

        SQXTN2   v4.8h,  v8.4s
        SQXTN2   v5.8h,  v9.4s
        SQXTN2   v6.8h, v10.4s
        SQXTN2   v7.8h, v11.4s
        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s

        SQADD    v4.8h,  v4.8h, v2.8h
        SQADD    v5.8h,  v5.8h, v2.8h
        SQADD    v6.8h,  v6.8h, v2.8h
        SQADD    v7.8h,  v7.8h, v2.8h
        SQADD   v16.8h, v16.8h, v2.8h
        SQADD   v17.8h, v17.8h, v2.8h
        SQADD   v18.8h, v18.8h, v2.8h
        SQADD   v19.8h, v19.8h, v2.8h
        LD1R    {v0.16b}, [x11], 1  // clamp min value

        // Saturating narrow 16 -> 8 bit: v4-v7 end up holding the complete
        // 16-byte output rows 0-3.
        SQXTN    v4.8b,  v4.8h
        SQXTN    v5.8b,  v5.8h
        SQXTN    v6.8b,  v6.8h
        SQXTN    v7.8b,  v7.8h
        LD1R    {v1.16b}, [x11]     // clamp max value
        SQXTN2   v4.16b, v16.8h
        SQXTN2   v5.16b, v17.8h
        SQXTN2   v6.16b, v18.8h
        SQXTN2   v7.16b, v19.8h
        LDR     x12, [sp, 32]   // cn_stride

        SMAX     v4.16b,  v4.16b, v0.16b
        SMAX     v5.16b,  v5.16b, v0.16b
        SMAX     v6.16b,  v6.16b, v0.16b
        SMAX     v7.16b,  v7.16b, v0.16b
        // x1 = nc - 16.  The flags are used twice: B.LO (fewer than 16 columns
        // were left) just below and B.NE (more columns remain) after the
        // stores; SMIN, ST1 and SUB do not modify them.
        SUBS    x1, x1, 16
        SMIN     v4.16b,  v4.16b, v1.16b
        SMIN     v5.16b,  v5.16b, v1.16b
        SMIN     v6.16b,  v6.16b, v1.16b
        SMIN     v7.16b,  v7.16b, v1.16b
        B.LO    6f

        // Each C pointer advances by cn_stride and each A pointer is rewound
        // by the rounded kc so the next tile rereads the same rows of A.
        # Store full 4 x 16
        ST1     {v4.16b}, [x6], x12
        SUB      x3,  x3, x2         // a0 -= kc
        ST1     {v5.16b}, [x8], x12
        SUB     x15, x15, x2         // a1 -= kc
        ST1     {v6.16b}, [x9], x12
        SUB     x13, x13, x2         // a2 -= kc
        ST1     {v7.16b}, [x7], x12
        SUB      x4,  x4, x2         // a3 -= kc
        B.NE    0b

        // Restore the callee-saved d8-d11 and release the 32-byte frame.
        LDP     d10, d11, [sp, 16]
        LDP      d8,  d9, [sp], 32
        RET

        // Entered at 4 when kc < 16 (main loop and epilogue skipped) or at 5
        // from the epilogue with x0 already equal to kc & 15.
        // Every row loads 16 bytes of A but only advances its pointer by x0,
        // so bytes past the remainder are read and then not used.
        // Each step of 4 bytes of K consumes a further 64 bytes of B.
        # Remainder- 4 to 12 bytes of A
        # Although C4, its safe to read 16 bytes.
        .p2align 3
4:
        AND     x0, x2, 15        // kc remainder 4 to 12
5:
        LDP      q8,  q9,  [x5], 32
        LDP     q10, q11,  [x5], 32
        LD1     {v0.16b},  [x3], x0
        LD1     {v1.16b}, [x15], x0
        LD1     {v2.16b}, [x13], x0
        LD1     {v3.16b},  [x4], x0
        SDOT    v16.4s,  v8.16b, v0.4b[0]
        SDOT    v17.4s,  v8.16b, v1.4b[0]
        SDOT    v18.4s,  v8.16b, v2.4b[0]
        SDOT    v19.4s,  v8.16b, v3.4b[0]
        SDOT    v20.4s,  v9.16b, v0.4b[0]
        SDOT    v21.4s,  v9.16b, v1.4b[0]
        SDOT    v22.4s,  v9.16b, v2.4b[0]
        SDOT    v23.4s,  v9.16b, v3.4b[0]
        SDOT    v24.4s, v10.16b, v0.4b[0]
        SDOT    v25.4s, v10.16b, v1.4b[0]
        SDOT    v26.4s, v10.16b, v2.4b[0]
        SDOT    v27.4s, v10.16b, v3.4b[0]
        SDOT    v28.4s, v11.16b, v0.4b[0]
        SDOT    v29.4s, v11.16b, v1.4b[0]
        SDOT    v30.4s, v11.16b, v2.4b[0]
        SDOT    v31.4s, v11.16b, v3.4b[0]
        // Done if the remainder was 4 bytes.
        CMP     x0, 4
        B.LS    3b
        LDP      q8,  q9,  [x5], 32
        LDP     q10, q11,  [x5], 32
        SDOT    v16.4s,  v8.16b, v0.4b[1]
        SDOT    v17.4s,  v8.16b, v1.4b[1]
        SDOT    v18.4s,  v8.16b, v2.4b[1]
        SDOT    v19.4s,  v8.16b, v3.4b[1]
        SDOT    v20.4s,  v9.16b, v0.4b[1]
        SDOT    v21.4s,  v9.16b, v1.4b[1]
        SDOT    v22.4s,  v9.16b, v2.4b[1]
        SDOT    v23.4s,  v9.16b, v3.4b[1]
        SDOT    v24.4s, v10.16b, v0.4b[1]
        SDOT    v25.4s, v10.16b, v1.4b[1]
        SDOT    v26.4s, v10.16b, v2.4b[1]
        SDOT    v27.4s, v10.16b, v3.4b[1]
        SDOT    v28.4s, v11.16b, v0.4b[1]
        SDOT    v29.4s, v11.16b, v1.4b[1]
        SDOT    v30.4s, v11.16b, v2.4b[1]
        SDOT    v31.4s, v11.16b, v3.4b[1]
        // Done if the remainder was 8 bytes; otherwise one more step for 12.
        CMP     x0, 8
        B.LS    3b
        LDP       q8,  q9,  [x5], 32
        LDP      q10, q11,  [x5], 32
        SDOT    v16.4s,  v8.16b, v0.4b[2]
        SDOT    v17.4s,  v8.16b, v1.4b[2]
        SDOT    v18.4s,  v8.16b, v2.4b[2]
        SDOT    v19.4s,  v8.16b, v3.4b[2]
        SDOT    v20.4s,  v9.16b, v0.4b[2]
        SDOT    v21.4s,  v9.16b, v1.4b[2]
        SDOT    v22.4s,  v9.16b, v2.4b[2]
        SDOT    v23.4s,  v9.16b, v3.4b[2]
        SDOT    v24.4s, v10.16b, v0.4b[2]
        SDOT    v25.4s, v10.16b, v1.4b[2]
        SDOT    v26.4s, v10.16b, v2.4b[2]
        SDOT    v27.4s, v10.16b, v3.4b[2]
        SDOT    v28.4s, v11.16b, v0.4b[2]
        SDOT    v29.4s, v11.16b, v1.4b[2]
        SDOT    v30.4s, v11.16b, v2.4b[2]
        SDOT    v31.4s, v11.16b, v3.4b[2]
        B       3b

        // x1 = nc - 16 is negative here, but subtracting 16 leaves the low
        // 4 bits equal to those of the remaining column count.  Bits 3, 2, 1
        // and 0 select stores of 8, 4, 2 and 1 bytes per row; after each
        // store DUP moves the next unstored element down to lane 0.
        # Store odd width
        .p2align 3
6:
        TBZ     x1, 3, 7f
        STR     d4, [x6], 8
        DUP     d4, v4.d[1]
        STR     d5, [x8], 8
        DUP     d5, v5.d[1]
        STR     d6, [x9], 8
        DUP     d6, v6.d[1]
        STR     d7, [x7], 8
        DUP     d7, v7.d[1]
7:
        TBZ     x1, 2, 8f
        STR     s4, [x6], 4
        DUP     s4, v4.s[1]
        STR     s5, [x8], 4
        DUP     s5, v5.s[1]
        STR     s6, [x9], 4
        DUP     s6, v6.s[1]
        STR     s7, [x7], 4
        DUP     s7, v7.s[1]
8:
        TBZ     x1, 1, 9f
        ST1     {v4.h}[0], [x6], 2
        DUP      h4, v4.h[1]
        ST1     {v5.h}[0], [x8], 2
        DUP      h5, v5.h[1]
        ST1     {v6.h}[0], [x9], 2
        DUP      h6, v6.h[1]
        ST1     {v7.h}[0], [x7], 2
        DUP      h7, v7.h[1]
9:
        TBZ     x1, 0, 10f
        ST1     {v4.b}[0], [x6]
        ST1     {v5.b}[0], [x8]
        ST1     {v6.b}[0], [x9]
        ST1     {v7.b}[0], [x7]
10:
        // Restore the callee-saved d8-d11 and release the 32-byte frame.
        LDP     d10, d11, [sp, 16]
        LDP      d8,  d9, [sp], 32
        RET

END_FUNCTION xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55

#ifdef __ELF__
// Empty .note.GNU-stack section: tells GNU-style linkers that this object
// does not need an executable stack.
.section ".note.GNU-stack","",%progbits
#endif
